library("readr")
library("igraph")
library("dplyr")
library("stringr")
library("scales")
library("textreuse")
source("R/helper.R")
source("R/section-matches.R")
Read the data.
load("cache/corpus-lsh.rda")
Create a network graph based on section percentages.
edges_pct <- summary_matches %>%
filter(percent_borrowed >= 0.05,
!is.na(match_code)) %>%
select(borrower_code, match_code, weight = percent_borrowed) %>%
group_by(borrower_code) %>%
top_n(2, weight)
edges_pct
## Source: local data frame [136 x 3]
## Groups: borrower_code [83]
##
## borrower_code match_code weight
## (chr) (chr) (dbl)
## 1 AK1900 OR1862 0.5937
## 2 AR1868 KY1851 0.3634
## 3 AR1868 KY1854 0.3158
## 4 AR1874 AR1868 0.6752
## 5 AR1874 KY1851 0.0818
## 6 AZ1865 CA1851 0.5476
## 7 AZ1865 CA1858 0.2590
## 8 AZ1887 CA1872 0.4313
## 9 CA1850 NY1849 0.2972
## 10 CA1850 NY1850 0.1207
## .. ... ... ...
g <- graph_from_data_frame(edges_pct, directed = TRUE)
nodes <- distances(g, to = "NY1850", algorithm = "unweighted") %>% as.data.frame() %>%
add_rownames() %>%
rename(name = rowname, distance = NY1850) %>%
mutate(color = ifelse(distance == 0, "red",
ifelse(distance == 1, "green",
ifelse(distance == 2, "yellow", "lightblue"))))
## Warning in distances(g, to = "NY1850", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
nodes[nodes$name == "NY1848", "color"] <- "red"
nodes[nodes$name == "NY1849", "color"] <- "red"
nodes[nodes$name == "NY1850", "color"] <- "red"
nodes[nodes$name == "NY1851", "color"] <- "red"
g <- graph_from_data_frame(edges_pct, directed = TRUE, vertices = nodes)
V(g)$year <- V(g)$name %>% extract_date()
set.seed(4221)
g <- add_layout_(g, with_graphopt(niter = 4000, spring.length = 25), normalize())
plot_before_year <- function(x, year) {
x_before <- induced.subgraph(x, which(V(x)$year <= year))
n <- V(x)$name
n_before <- V(x_before)$name
filter <- n %in% n_before
x_before$layout <- x_before$layout[filter, ]
par(mar = c(0,0,1,0))
plot(x_before, edge.width = E(x_before)$weight * 8,
edge.arrow.size = 0.0, vertex.size = 5)
title(paste0("Codes of Civil Procedure before ", year))
}
for (i in seq(1850, 1900, 5)) {
plot_before_year(g, i)
}
Create a graph based on numbers (not percentages) of sections shared. Notice that we are keeping only code to code matches that share a certain number of sections (minimum_n), we are keeping only a certain number of matches for each code (top_matches), and we are omitting codes that aren’t part of the main network.
minimum_n <- 20
top_matches <- 2
codes_not_to_plot <- c(
# "CO868",
# "CT1879",
# "FL1847",
# "FL1892",
"GA1851",
"GA1860",
"HI1859",
"HI1897",
# "IL1866",
"LA1825",
"LA1844"
# "MS1848",
# "MS1857",
# "NY1876",
# "NY1879",
# "VA1860",
# "VA1893",
# "WV1868"
)
# codes_not_to_plot <- NULL
edges_n <- summary_matches %>%
filter(!is.na(match_code),
sections_borrowed >= minimum_n) %>%
select(borrower_code, match_code, weight = sections_borrowed) %>%
group_by(borrower_code) %>%
top_n(top_matches, weight) %>%
ungroup() %>%
mutate(weight = rescale(weight)) %>%
filter(!borrower_code %in% codes_not_to_plot,
!match_code %in% codes_not_to_plot)
edges_n
## Source: local data frame [146 x 3]
##
## borrower_code match_code weight
## (chr) (chr) (dbl)
## 1 AK1900 OR1862 0.398192771
## 2 AK1900 NY1850 0.002409639
## 3 AK1900 OR1854 0.002409639
## 4 AR1868 KY1851 0.209036145
## 5 AR1868 KY1854 0.180120482
## 6 AR1874 AR1868 0.146987952
## 7 AR1874 KY1851 0.007228916
## 8 AZ1865 CA1851 0.199397590
## 9 AZ1865 CA1858 0.087951807
## 10 AZ1887 CA1872 0.203614458
## .. ... ... ...
g_n <- graph_from_data_frame(edges_n, directed = TRUE)
node_distances <- distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"),
algorithm = "unweighted") %>%
apply(1, min, na.rm = TRUE)
## Warning in distances(g_n, to = c("NY1848", "NY1849", "NY1850", "NY1851"), :
## Unweighted algorithm chosen, weights ignored
nodes_n <- data_frame(name = names(node_distances), distance = node_distances) %>%
mutate(color = ifelse(distance == 0, "red",
ifelse(distance == 1, "green",
ifelse(distance == 2, "yellow", "lightblue"))))
g_n <- graph_from_data_frame(edges_n, directed = TRUE, vertices = nodes_n)
V(g_n)$year <- V(g_n)$name %>% extract_date()
edge_size_clamp <- function(g, multiplier = 20, max_val = 6, min_val = 1) {
w <- E(g)$weight * multiplier
w[w > max_val] <- max_val
w[w < min_val] <- min_val
w
}
set.seed(4221)
g_n <- g_n %>% add_layout_(with_graphopt(niter = 4000, spring.length = 25),
normalize())
par(mar = c(0,0,1,0))
plot(g_n, edge.width = edge_size_clamp(g_n), edge.arrow.size = 0, vertex.size = 5)
title("Borrowings between codes, number of sections borrowed")
Now do a state to state network:
min_state_borrowings <- 100
top_matches <- 2
edges_states <- summary_matches %>%
mutate(borrower_date = extract_date(borrower_code),
match_date = extract_date(match_code),
borrower_state = extract_state(borrower_code),
match_state = extract_state(match_code)) %>%
filter(!is.na(match_code),
borrower_date >= match_date,
borrower_state != match_state) %>%
group_by(borrower_state, match_state) %>%
summarize(n = sum(sections_borrowed)) %>%
filter(n >= min_state_borrowings) %>%
select(borrower_state, match_state, weight = n) %>%
group_by(borrower_state) %>%
top_n(top_matches, weight) %>%
ungroup() %>%
mutate(weight = rescale(weight))
edges_states
## Source: local data frame [44 x 3]
##
## borrower_state match_state weight
## (chr) (chr) (dbl)
## 1 AK OR 0.57655039
## 2 AR KY 0.60562016
## 3 AZ CA 0.75484496
## 4 CA NY 0.69670543
## 5 CO CA 0.05910853
## 6 CO IL 0.04166667
## 7 DC IN 0.11821705
## 8 DT ND 0.22674419
## 9 DT NE 0.19476744
## 10 FL NY 0.03488372
## .. ... ... ...
g_states <- graph_from_data_frame(edges_states, directed = TRUE)
state_distances <- distances(g_states, to = "NY", algorithm = "unweighted")
## Warning in distances(g_states, to = "NY", algorithm = "unweighted"):
## Unweighted algorithm chosen, weights ignored
nodes_states <- data_frame(name = rownames(state_distances),
distance = state_distances[, 1]) %>%
mutate(color = ifelse(distance == 0, "red",
ifelse(distance == 1, "green",
ifelse(distance == 2, "yellow", "lightblue"))))
g_states <- graph_from_data_frame(edges_states, directed = TRUE,
vertices = nodes_states) %>%
decompose(min.vertices = 3) %>%
`[[`(1)
set.seed(4221)
g_states <- g_states %>% add_layout_(with_graphopt(niter = 4000,
spring.length = 25),
normalize())
par(mar = c(0,0,1,0))
plot(g_states,
edge.width = edge_size_clamp(g_n), edge.arrow.size = 0.5,
edge.arrow.mode = 1,
vertex.size = 5, vertex.label.dist = 0.85, vertex.label.degree = pi)
title("Borrowings between states, number of sections borrowed")